# importing necessary libraries
library(tidyverse)
library(knitr)
library(readxl)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
options(scipen = 4)

Introduction

# importing the dataset
# NOTE(review): absolute, user-specific path -- not reproducible on other
# machines; consider a relative path. The second argument (1) selects the
# first sheet of the workbook.
ltv <- read_excel("C:/Users/shiri/Dropbox/CMU - 2nd sem/Data Mining/Project/ltv Dataset.xlsx", 1)

# dimension of the dataset (rows x columns)
dim(ltv)
## [1] 403835      9
# names of columns in the dataset
colnames(ltv)
## [1] "id"        "status"    "gender"    "date"      "pages"     "onsite"   
## [7] "entered"   "completed" "holiday"
# checking the first 10 rows of the dataset (one row per customer session)
kable(head(ltv, 10))
id status gender date pages onsite entered completed holiday
1 0 M 2014-10-31 7 3 1 1 0
1 1 M 2014-11-01 6 8 0 0 0
1 1 M 2014-11-02 6 20 1 0 0
1 1 M 2014-12-15 1 1 0 0 0
1 1 M 2014-12-19 1 1 0 0 0
1 2 M 2014-12-23 0 0 0 0 0
2 0 F 2013-09-07 7 23 1 1 0
2 1 F 2013-09-08 9 30 1 1 0
2 1 F 2013-09-09 7 19 1 1 0
2 1 F 2013-09-11 8 3 1 0 0

Exploratory Data Analysis

Describing variables

# Descriptive summary (min / quartiles / mean / max) of every column,
# rendered as a table.
summary(ltv) %>% kable()
id status gender date pages onsite entered completed holiday
Min. : 1 Min. :0.0000 Length:403835 Min. :2011-01-01 00:00:00 Min. : 0.000 Min. : 0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.: 2537 1st Qu.:1.0000 Class :character 1st Qu.:2012-06-29 00:00:00 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 5025 Median :1.0000 Mode :character Median :2013-05-04 00:00:00 Median : 5.000 Median : 5.000 Median :1.0000 Median :1.0000 Median :0.0000
Mean : 5017 Mean :0.9909 NA Mean :2013-04-10 19:01:17 Mean : 5.018 Mean : 8.831 Mean :0.7821 Mean :0.5627 Mean :0.2277
3rd Qu.: 7495 3rd Qu.:1.0000 NA 3rd Qu.:2014-01-26 00:00:00 3rd Qu.: 7.000 3rd Qu.: 11.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
Max. :10000 Max. :2.0000 NA Max. :2014-12-31 00:00:00 Max. :10.000 Max. :220.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
# Count missing values across the entire dataset (0 == complete data).
sum(is.na(ltv))
## [1] 0
# Recode the categorical indicator columns as factors, and coerce the
# login timestamp to a plain Date.
for (col in c("status", "entered", "completed", "holiday")) {
  ltv[[col]] <- as.factor(ltv[[col]])
}
# NOTE(review): as.Date() on a POSIXct uses UTC by default -- assumes the
# Excel timestamps are midnight so no dates shift; confirm.
ltv$date <- as.Date(ltv$date)

# getting distinct gender count
# Number of unique customers per gender; n_distinct(id) collapses the many
# session rows per customer down to one count per person.
gender_count <- ltv %>%
  group_by(gender) %>%
  summarize(n_distinct(id))

# col.names overrides the auto-generated column name from summarize().
kable(gender_count, col.names = c("Gender", "# Unique Customers"))
Gender # Unique Customers
F 7924
M 2076
# Latest recorded status for every customer: take the final row of each
# id's block of sessions.
# NOTE(review): relies on rows already being in chronological order within
# each id (no sort by date here) -- confirm against the source file.
current_status_df <- ltv %>%
  arrange(id) %>%
  group_by(id) %>%
  slice(n())

kable(table(current_status_df$status), col.names = c("Current Status", "# Unique Customers"))
Current Status # Unique Customers
0 2
1 3681
2 6317

Univariate Analysis

# plotting histograms to get an idea of distribution of different variables
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.3
# Histogram of login dates, mean date marked with a red vertical line.
p1 <- ggplot(data = ltv, aes(x = date, fill = I("lightblue"))) +
  geom_histogram() +
  xlab("Date of login") +
  theme_bw() +
  geom_vline(aes(xintercept = mean(date)), col = 'red')

# Histogram of pages viewed per session, mean in red.
p2 <- ggplot(data = ltv, aes(x = pages, fill = I("lightblue"))) +
  geom_histogram() +
  xlab("# Pages visited by the user in a session") +
  theme_bw() +
  geom_vline(aes(xintercept = mean(pages)), col = 'red')

# Histogram of minutes spent on site, mean in red.
# FIX: col = 'red' was previously inside aes(), which maps the literal
# string "red" as an aesthetic (adding a spurious legend) instead of
# actually setting the line colour; moved outside aes() to match p1/p2.
p3 <- ggplot(data = ltv, aes(x = onsite, fill = I("lightblue"))) +
  geom_histogram() +
  xlab("Minutes spent by the user on the website") +
  theme_bw() +
  geom_vline(aes(xintercept = mean(onsite)), col = 'red')

# Stack the three histograms into a single figure.
grid.arrange(p1, p2, p3)

# Boxplot of session minutes to surface outliers in `onsite`; rendered
# interactively with plotly so individual outlier points can be inspected.
p4 <- ggplot(ltv, aes(y = onsite, fill = I("lightblue"))) +
  geom_boxplot() +
  theme_bw() +
  labs(x = "Minutes spent by the user on the website")

ggplotly(p4)
# log transforming onsite variable
# log1p(x) computes log(1 + x); it is the idiomatic (and numerically more
# accurate) spelling of the previous log(1 + ltv$onsite), and is exact for
# sessions with onsite == 0.
ltv$log.onsite <- log1p(ltv$onsite)

# re-checking the distribution after the transform, mean marked in red
p5 <- ggplot(data = ltv, aes(x = log.onsite, fill = I("lightblue"))) +
  geom_histogram() +
  xlab("Log of minutes spent by the user on the website") +
  theme_bw() +
  geom_vline(aes(xintercept = mean(log.onsite)), col = 'red')

ggplotly(p5)
# Sessions (rows) recorded per customer id.
events.per.customer <- ltv %>%
  count(id, name = "events")

# Histogram of session counts per customer, mean in red.
p6 <- ggplot(events.per.customer, aes(x = events, fill = I("lightblue"))) +
  geom_histogram() +
  theme_bw() +
  labs(x = "# Events per Customer ID", y = "Count of Customers") +
  geom_vline(aes(xintercept = mean(events)), col = 'red')

ggplotly(p6)

Bivariate Analyses

# Feature engineering: each customer's registration date is their earliest
# login date in the data.
registration_date <- ltv %>%
  group_by(id) %>%
  summarize(registration.date = min(date))

# Attach registration.date to every session row (merge joins on the shared
# id column).
ltv.1 <- merge(ltv, registration_date)

# Customer tenure at the time of each session, in whole days.
ltv.1$days.since.registration <- as.numeric(ltv.1$date - ltv.1$registration.date)

# Gap in days since the customer's previous session; the first session of
# each customer gets 0 because lag() defaults to that row's own date.
ltv.1 <- ltv.1 %>%
  group_by(id) %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(days.since.last.visit = as.numeric(date - lag(date, default = first(date))))

# checking the first 10 values
kable(head(ltv.1, 10))
id status gender date pages onsite entered completed holiday log.onsite registration.date days.since.registration days.since.last.visit
1 0 M 2014-10-31 7 3 1 1 0 1.3862944 2014-10-31 0 0
1 1 M 2014-11-01 6 8 0 0 0 2.1972246 2014-10-31 1 1
1 1 M 2014-11-02 6 20 1 0 0 3.0445224 2014-10-31 2 1
1 1 M 2014-12-15 1 1 0 0 0 0.6931472 2014-10-31 45 43
1 1 M 2014-12-19 1 1 0 0 0 0.6931472 2014-10-31 49 4
1 2 M 2014-12-23 0 0 0 0 0 0.0000000 2014-10-31 53 4
2 0 F 2013-09-07 7 23 1 1 0 3.1780538 2013-09-07 0 0
2 1 F 2013-09-08 9 30 1 1 0 3.4339872 2013-09-07 1 1
2 1 F 2013-09-09 7 19 1 1 0 2.9957323 2013-09-07 2 1
2 1 F 2013-09-11 8 3 1 0 0 1.3862944 2013-09-07 4 2
# Customer segmentation input: one "recency" value per customer.
# Recency = days between the last date in the data (2014-12-31) and the
# customer's most recent session. Only customers whose status never
# reached 2 (i.e. who have not churned) are kept.
#
# FIX: this previously used max(as.numeric(status)), which converts the
# FACTOR to its internal level codes (1, 2, 3) rather than the original
# values (0, 1, 2), and filtered on those codes -- it selected the right
# rows only by coincidence. Converting through as.character() first
# recovers the original values and matches how ltv.agg is built below.
ltv.recency <- ltv.1 %>%
  group_by(id) %>%
  summarize(status = max(as.numeric(as.character(status))),
            recency = as.numeric(as.Date("2014-12-31") - max(date))) %>%
  filter(status == 0 | status == 1)

# status was only needed for the filter; drop it.
ltv.recency <- subset(ltv.recency, select = -c(status))

# checking the first 10 rows of the dataframe ltv.recency
kable(head(ltv.recency, 10))
id recency
3 87
4 9
6 12
7 88
10 1
11 11
12 11
15 14
18 13
19 184
# Segmentation intuition: high recency ~ dormant ("sleeping") customers,
# medium recency ~ active customers, low recency ~ new customers.

# Distribution of recency across customers, mean marked in red.
p7 <- ggplot(ltv.recency, aes(x = recency, fill = I("lightblue"))) +
  geom_histogram(binwidth = 10) +
  geom_vline(aes(xintercept = mean(recency)), col = "red") +
  theme_bw()

ggplotly(p7)
# adding more features for clustering
# One row per customer:
#   status          - max of the raw 0/1/2 status values; used only to keep
#                     customers that never churned (filter below), then dropped
#   is.female       - 1 iff max(gender) == "F"; since max() on a character
#                     vector is alphabetical, this is 1 only when no "M"
#                     occurs for that id (gender presumably constant per
#                     customer -- TODO confirm)
#   mean.pages      - average pages viewed per session
#   mean.log.onsite - average of the log-transformed minutes on site
#   prop.entered    - share of sessions with entered == 1 (sum of the 0/1
#                     indicator over the session count n())
#   prop.completed  - share of sessions with completed == 1
#   recency         - days from last session to 2014-12-31 (the data's max date)
#   frequency       - mean gap in days between consecutive sessions
ltv.agg <- ltv.1 %>%
  group_by(id) %>%
  summarize(status = max(as.numeric(as.character(status))), is.female = ifelse(max(gender) == "F", 1, 0), 
            mean.pages = round(mean(pages), 2), mean.log.onsite = round(mean(log.onsite), 2), 
            prop.entered = round(sum(as.numeric(as.character(entered)))/n(), 2),
            prop.completed = round(sum(as.numeric(as.character(completed)))/n(), 2),
            recency = as.numeric(as.Date("2014-12-31") - max(date)), 
            frequency = round(mean(days.since.last.visit), 2)) %>%
  filter(status == 0 | status == 1)

# status was only needed for the churn filter; drop it.
ltv.agg <- subset(ltv.agg, select = -c(status))

# checking the first 10 rows
kable(head(ltv.agg, 10))
id is.female mean.pages mean.log.onsite prop.entered prop.completed recency frequency
3 1 4.93 1.77 0.66 0.59 87 16.41
4 1 5.79 1.97 0.75 0.67 9 22.00
6 0 6.14 2.01 1.00 1.00 12 29.71
7 1 5.30 2.00 0.83 0.57 88 22.96
10 1 6.75 2.12 0.88 0.75 1 4.88
11 1 4.99 1.95 0.79 0.48 11 12.20
12 1 4.92 1.98 0.77 0.52 11 11.12
15 1 5.03 1.71 0.82 0.57 14 10.43
18 1 5.85 2.05 0.85 0.69 13 10.00
19 1 4.98 1.82 0.67 0.49 184 14.65
# checking correlation between features
# Panel function for pairs(): prints |r| in each lower-triangle cell,
# scaled so stronger correlations are drawn larger (floor of cex = 1
# via pmax so weak correlations remain legible).
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
    # Save and restore the coordinate system we are about to override.
    # FIX: the restore was previously on.exit(par(usr)) -- par() does not
    # interpret an unnamed vector as setting the "usr" parameter, so the
    # old value was never reliably restored; the argument must be named.
    usr <- par("usr"); on.exit(par(usr = usr))
    par(usr = c(0, 1, 0, 1))  # unit coordinates for text placement
    r <- abs(cor(x, y))
    # format() against a 9-digit reference value keeps a fixed precision.
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    # Default size: scale the label to the cell width.
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = pmax(1, cex.cor * r))
}

# Feature columns to inspect for pairwise correlation.
var.names <- c("is.female", "mean.pages", "mean.log.onsite", "prop.entered", "prop.completed", "recency", "frequency")
pairs(ltv.agg[, var.names], lower.panel = panel.cor)

# Drop mean.pages and prop.entered: both are highly correlated with
# prop.completed, so they add little information to the clustering.
ltv.agg <- ltv.agg %>% select(-mean.pages, -prop.entered)

# Elbow method: fit k-means for k = 1..10 and record the total
# within-cluster sum of squares at each k.
# NOTE(review): this runs on the unscaled features (columns 2:6; column 1
# is id), while the final clustering below uses scaled features -- confirm
# that is intentional.
set.seed(0)

withinss <- vapply(
  seq_len(10),
  function(k) kmeans(ltv.agg[, 2:6], k)$tot.withinss,
  numeric(1)
)

plot(x = 1:10, y = withinss, ylab = "Sum of Squares", xlab = "# of clusters", type = "l")

# K-means with k = 3 on the single feature "recency".
set.seed(1)
# FIX: was cbind(ltv.agg) -- cbind() of a single data frame is a no-op
# idiom that can silently change the object's class; a plain copy is what
# is meant (this replaces the earlier id/recency-only ltv.recency).
ltv.recency <- ltv.agg
clusters <- kmeans(ltv.recency$recency, 3)

# Sort the fitted centers so cluster labels come out in increasing
# recency order (1 = most recent ... 3 = most dormant).
centers <- sort(clusters$centers)

# Re-fit using the sorted centers as explicit starting points, so the
# final labels follow that order.
clusters.new <- kmeans(ltv.recency$recency, centers)

ltv.recency$cluster <- as.factor(clusters.new$cluster)

# Per-cluster summary: size plus mean / min / median / max recency.
table.recency <- ltv.recency %>%
  group_by(cluster) %>%
  summarize(count = n(), mean.recency = mean(recency), zero.percentile = min(recency), 
            median.recency = median(recency), hundred.percentile = max(recency))

kable(table.recency)
cluster count mean.recency zero.percentile median.recency hundred.percentile
1 3411 15.27089 0 10 136
2 191 261.54974 140 238 473
3 81 712.39506 488 657 1104
# K-means with k = 3 on all five remaining features.
set.seed(2)
# FIX: was cbind(ltv.agg) -- cbind() of a single data frame is a no-op
# idiom that can silently change the object's class; a plain copy is
# what is meant here.
ltv.all <- ltv.agg
# Standardize the features (columns 2:6; column 1 is id) so no single
# variable dominates the Euclidean distances.
ltv.all.scale <- scale(ltv.all[, 2:6])
clusters <- kmeans(ltv.all.scale, 3)

ltv.all$cluster <- as.factor(clusters$cluster)

# Per-cluster recency profile, for comparison with the recency-only model.
table.all.var <- ltv.all %>%
  group_by(cluster) %>%
  summarize(count = n(), mean.recency = mean(recency), zero.percentile = min(recency), 
            median.recency = median(recency), hundred.percentile = max(recency))

kable(table.all.var)
cluster count mean.recency zero.percentile median.recency hundred.percentile
1 2786 25.595477 0 10.0 328
2 743 8.497981 0 7.0 201
3 154 533.285714 17 508.5 1104